home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Tech Arsenal 1
/
Tech Arsenal (Arsenal Computer).ISO
/
tek-01
/
us20src.zip
/
BIC.C
< prev
next >
Wrap
C/C++ Source or Header
|
1992-06-27
|
11KB
|
578 lines
/* BIC: Bickel's Algorithm test program
Spell Checker and Corrector
(C)opyright May 1990, 1991 by Daniel Lawrence
All Rights Reserved
Revision History:
29-mar-91 Trying some filters to improve this
*/
#define maindef 1
#include <stdio.h>
#include <ctype.h>
#include "dopt.h"
#include "dstruct.h"
#include "ddef.h"
#define MWSIZE 40 /* maximum # chars in a word */
/* letter weighing table */
char lweight[26] = {
/* a */ 3,
/* b */ 6,
/* c */ 5,
/* d */ 4,
/* e */ 3,
/* f */ 5,
/* g */ 5,
/* h */ 4,
/* i */ 3,
/* j */ 8,
/* k */ 7,
/* l */ 4,
/* m */ 5,
/* n */ 3,
/* o */ 3,
/* p */ 5,
/* q */ 7,
/* r */ 4,
/* s */ 3,
/* t */ 3,
/* u */ 4,
/* v */ 6,
/* w */ 5,
/* x */ 8,
/* y */ 8,
/* z */ 9
};
/* some limits */
#define MAXMATCHES 50
/* local prototypes */
#if TURBO
char *cleanword(char *word);
char *sortword(char *word);
char *intersect(char *word1, char *word2);
int weight(char *cletters);
int mismatch(char *, char *);
#else
char *cleanword();
char *sortword();
char *intersect();
int weight();
int mismatch();
#endif
/* GLOBALS for BIC */
char testwrd[MWSIZE]; /* word to test against dictionary */
char sortwrd[MWSIZE]; /* sorted letters of testwrd */
char dictwrd[MWSIZE]; /* dictionary word to compare */
int likeness; /* likeness score of current word */
int threshold; /* threshold to display likeness */
int max_matches; /* maximum number of matches */
char matches[MAXMATCHES][128]; /* list of current matches */
int match_scores[MAXMATCHES]; /* difference scores for these */
int swterse = FALSE; /* called from EMACS? */
int swverbose = FALSE; /* display extra info? */
int exact_match = FALSE; /* is this word in the dictionary? */
main(argc, argv)
int argc; /* command line argument count */
char **argv; /* argument vector */
{
register int dif_value; /* difference value of current word */
register char *sortptr; /* ptr to the sorted cleaned dictionary word */
register char first_char; /* first character in word to test */
max_matches = 12; /* default list length! */
if (argc < 2) {
usage();
exit(EXBADOPT);
}
/* deal with the command options */
while (argc > 1) {
/* grab the next option */
--argc;
argv++;
strcpy(testwrd, argv[0]);
if (*testwrd == '-') {
switch (testwrd[1]) {
case 'm':
--argc;
argv++;
max_matches = atoi(argv[0]);
break;
case 't':
swterse = TRUE;
break;
case 'v':
swverbose = TRUE;
break;
default:
usage();
exit(EXBADOPT);
}
}
}
/* announce us */
if (swterse == FALSE)
printf("BIC %s Bickel's Test Algorithm\n", VERSION);
/* initialize the lcase array */
init_lcase();
/* grab the test word */
strcpy(sortwrd, sortword(cleanword(testwrd)));
threshold = weight(intersect(sortwrd, sortwrd)) - 4;
if (swverbose)
printf("threshold = %u\n", threshold);
/* initailize the list of matches */
init_match();
/* open the main dictionary */
if (mopen() == FALSE)
exit(EXMDICT);
first_char = lcase[*testwrd];
skipdict(first_char);
strcpy(dictwrd, nxtmword());
/* scan the dictionary */
while (lcase[*dictwrd] == first_char) {
/* dump them if off by more than 3 chars! */
if (abs(strlen(dictwrd) - strlen(testwrd)) < 4) {
/* clean and sort the word */
sortptr = sortword(cleanword(dictwrd));
/* calculate the likeness score */
likeness = weight(intersect(sortwrd, sortptr));
/* tell us about it */
if (likeness > threshold) {
/* are these nearly alike? */
if ((dif_value = mismatch(sortwrd, sortptr)) < 20) {
add_match(dictwrd, dif_value);
/* is this the word? */
if (strcmp(testwrd, dictwrd) == 0) {
exact_match = TRUE;
if (swterse) {
printf("EXACT MATCH\n");
mclose();
exit(EXGOOD);
}
}
}
}
}
/* get a dictionary word */
nextword: strcpy(dictwrd, nxtmword());
/* at the end... skip out */
if (strcmp(dictwrd, hivalue) == 0)
break;
}
/* close the main dictionary */
mclose();
/* show us the results */
show_match();
/* error code of EXMISS if word is not in dictionary */
if (exact_match) {
exit(EXGOOD);
} else {
exit(EXMISS);
}
}
usage() /* print the command line usage */
{
printf("BIC: Bickel's Algorithm Test Program\n");
printf(" for MicroSPELL %s\n",VERSION);
puts("\nUsage\n");
puts(" BIC {-t|-v} {-m <n>} <test word>\n");
puts(" -m <n> return top <n> suggestions / default = 12");
puts(" -t terse mode");
puts(" -v verbose mode");
}
#if RAMSIZE & LATTICE & MSDOS
/* These routines will allow me to track memory usage by placing
a layer on top of the standard system malloc() and free() calls.
with this code defined, the number of allocated bytes is displayed
in the upper right corner of the screen
*/
#undef malloc
#undef free
char *allocate(nbytes) /* allocate nbytes and track */
unsigned nbytes; /* # of bytes to allocate */
{
char *mp; /* ptr returned from malloc */
char *malloc();
mp = malloc(nbytes);
if (mp) {
envram += nbytes;
#if RAMSHOW
dspram();
#endif
}
return(mp);
}
release(mp) /* release malloced memory and track */
char *mp; /* chunk of RAM to release */
{
unsigned *lp; /* ptr to the long containing the block size */
if (mp) {
lp = ((unsigned *)mp) - 1;
/* update amount of ram currently malloced */
envram -= (long)*lp - 2;
free(mp);
#if RAMSHOW
dspram();
#endif
}
}
#if RAMSHOW
dspram() /* display the amount of RAM currently malloced */
{
char mbuf[20];
char *sp;
/* TTmove(term.t_nrow - 1, 70);*/
sprintf(mbuf, "[%lu]", envram);
sp = &mbuf[0];
puts(sp);
}
#endif
#endif
#if AZTEC & MSDOS
#undef fgetc
/* a1getc: Get an ascii char from the file input stream
but DO NOT strip the high bit
*/
int a1getc(fp)
FILE *fp;
{
int c; /* translated character */
c = getc(fp); /* get the character */
/* if its a <LF> char, throw it out */
while (c == 10)
c = getc(fp);
/* if its a <RETURN> char, change it to a LF */
if (c == '\r')
c = '\n';
/* if its a ^Z, its an EOF */
if (c == 26)
c = EOF;
return(c);
}
#endif
#if CMS
#undef fopen
/* The IBM 30xx likes to tell us when file opens
fail...it's too chatty....I like to handle these myself */
FILE *cmsopen(file, mode)
char *file; /* name of file to open */
char *mode; /* mode to open it in */
{
quiet(1);
return(fopen(file,mode));
}
#endif
/**********************************************************************/
/* clean word: dump all the non alpha chars, and make them
lowercase only
*/
char *cleanword(word)
char *word; /* word to clean */
{
register char *sp; /* ptr into return buffer */
static char rbuf[MWSIZE]; /* return buffer */
/* scan through the word */
sp = rbuf;
while (*word) {
if (isalpha(*word)) {
if (islower(*word))
*sp++ = *word;
else
*sp++ = *word + 32;
}
++word;
}
*sp = 0;
return(rbuf);
}
/* Sort word: bubble sort the letters in a word....
average word size is 4.5 chars. Bubble sort
is most efficient at that size
*/
char *sortword(word)
char *word; /* word to sort letters in */
{
register int i, j; /* bubble sort indexes */
register int temp; /* temp for swap */
register int swapflag; /* did a swap occur? */
static char sword[MWSIZE]; /* return buffer */
/* copy the word to the return buffer */
strcpy(sword, word);
/* bubble sort it */
for (i = strlen(sword) - 1; i > 0; i--) {
/* scan through the list once */
swapflag = FALSE;
for (j = 0; j < i; j++) {
/* compare */
if (sword[j] > sword[j + 1]) {
temp = sword[j];
sword[j] = sword[j + 1];
sword[j + 1] = temp;
swapflag = TRUE;
}
}
/* if no movement occurred, we are done early */
if (swapflag == FALSE)
break;
}
return(sword);
}
/* intersect: find all letters occuring in 2 words */
char *intersect(word1, word2)
char *word1; /* first word to intersect with */
char *word2; /* second word */
{
register char *iptr; /* ptr into return buffer */
static char rbuf[MWSIZE]; /* return buffer */
/* scan through both words */
iptr = rbuf;
while (*word1 && *word2) {
/* if they are the same, record it */
if (*word1 == *word2) {
*iptr = *word1++;
++word2;
/* skip duplicates */
while (*word1 == *iptr)
++word1;
while (*word2 == *iptr)
++word2;
++iptr;
continue;
}
/* advance the appropriate word */
if (*word1 < *word2)
++word1;
else
++word2;
}
/* terminate and return the intersection */
*iptr = 0;
return(rbuf);
}
/* weight: using the common letters between two words, apply
bickels weighing function to produce a sameness score
*/
int weight(cletters)
char *cletters; /* common letters between two words */
{
int result; /* resulting weighing factor */
/* scan the letters, add up the result */
result = 0;
while (*cletters)
result += lweight[*cletters++ - 'a'];
return(result);
}
/* How many characters are different between these words? */
int mismatch(word1, word2)
char *word1; /* first word to compare */
char *word2; /* second word to compare */
{
int count; /* count of mismatched characters */
count = 0;
while (*word1) {
/* at the end of word2? */
if (*word2 == 0) {
count += lweight[*word1-'a'];
++word1;
} else {
/* a match, advance */
if (*word1 == *word2) {
++word1;
++word2;
} else {
if (*word1 < *word2) {
count += lweight[*word1-'a'];
++word1;
} else {
count += lweight[*word2-'a'];
++word2;
}
}
}
}
/* past the rest */
while (*word2) {
++count;
count += lweight[*word2-'a'];
++word2;
}
return(count);
}
init_match()
{
register int index;
for (index = 0; index < max_matches; index++) {
matches[index][0] = 0;
match_scores[index] = 1000;
}
}
add_match(word, score)
char *word; /* word to add to match list */
int score; /* score of that word */
{
register int index; /* ptr into match list */
register int rest; /* ptr into rest of list to move */
for (index = 0; index < max_matches; index++) {
if (match_scores[index] > score) {
for (rest = max_matches; rest > index; --rest) {
match_scores[rest] = match_scores[rest-1];
strcpy(matches[rest], matches[rest-1]);
}
match_scores[index] = score;
strcpy(matches[index], word);
return;
}
}
}
show_match()
{
register int index;
register int split;
if (swterse == FALSE) {
for (index = 0; index < max_matches; index++) {
if (match_scores[index] > 999)
return;
if (swverbose)
printf("[%u] %s\n", match_scores[index], matches[index]);
else
printf("%s\n", matches[index]);
}
} else {
split = (max_matches + 2) / 3;
for (index = 0; index < split; index++) {
printf("%20s %20s %20s \n",
matches[index],
matches[index + split],
matches[index + split * 2]);
}
}
}